F1 Races Results dataset 1950 to 2024¶
importing libraries¶
In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import bar_chart_race as bcr
import os
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
In [2]:
os.getcwd()
Out[2]:
'C:\\Users\\aksha\\Jupyter\\Project'
In [3]:
os.chdir ('D:\$TUDY\F1')
os.getcwd()
Out[3]:
'D:\\$TUDY\\F1'
In [4]:
drivers = pd.read_csv('drivers_updated.csv')
laps = pd.read_csv('fastest_laps_updated.csv')
teams = pd.read_csv('teams_updated.csv')
win = pd.read_csv('winners.csv')
Data Loading & Data Cleaning¶
Drivers¶
In [5]:
drivers.head()
Out[5]:
| Pos | Driver | Nationality | Car | PTS | year | Code | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | Nino Farina | ITA | Alfa Romeo | 30.0 | 1950 | FAR |
| 1 | 2 | Juan Manuel Fangio | ARG | Alfa Romeo | 27.0 | 1950 | FAN |
| 2 | 3 | Luigi Fagioli | ITA | Alfa Romeo | 24.0 | 1950 | FAG |
| 3 | 4 | Louis Rosier | FRA | Talbot-Lago | 13.0 | 1950 | ROS |
| 4 | 5 | Alberto Ascari | ITA | Ferrari | 11.0 | 1950 | ASC |
In [6]:
drivers.describe()
Out[6]:
| PTS | year | |
|---|---|---|
| count | 1661.000000 | 1661.000000 |
| mean | 31.138170 | 1987.124624 |
| std | 60.446033 | 21.849750 |
| min | 0.000000 | 1950.000000 |
| 25% | 3.000000 | 1968.000000 |
| 50% | 9.000000 | 1987.000000 |
| 75% | 32.000000 | 2006.000000 |
| max | 575.000000 | 2024.000000 |
In [7]:
# filling missing values
drivers['Car'].fillna({"Unknown": 0}, inplace=True)
In [8]:
drivers.shape
Out[8]:
(1661, 7)
In [9]:
drivers.shape
Out[9]:
(1661, 7)
In [10]:
drivers.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1661 entries, 0 to 1660 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pos 1661 non-null object 1 Driver 1661 non-null object 2 Nationality 1661 non-null object 3 Car 1650 non-null object 4 PTS 1661 non-null float64 5 year 1661 non-null int64 6 Code 1661 non-null object dtypes: float64(1), int64(1), object(5) memory usage: 91.0+ KB
In [11]:
drivers.isnull().sum()
Out[11]:
Pos 0 Driver 0 Nationality 0 Car 11 PTS 0 year 0 Code 0 dtype: int64
In [12]:
points = drivers.groupby('Driver').agg(total=('PTS', sum)).reset_index()
points = points.sort_values('total', ascending=False).head(20)
points
Out[12]:
| Driver | total | |
|---|---|---|
| 215 | Lewis Hamilton | 4681.5 |
| 348 | Sebastian Vettel | 3098.0 |
| 247 | Max Verstappen | 2755.5 |
| 102 | Fernando Alonso | 2300.0 |
| 210 | Kimi Räikkönen | 1873.0 |
| 266 | Nico Rosberg | 1594.5 |
| 353 | Sergio Perez | 1593.0 |
| 249 | Michael Schumacher | 1566.0 |
| 63 | Daniel Ricciardo | 1322.0 |
| 166 | Jenson Button | 1235.0 |
| 47 | Charles Leclerc | 1212.0 |
| 100 | Felipe Massa | 1167.0 |
| 44 | Carlos Sainz | 1090.5 |
| 382 | Valtteri Bottas | 1081.0 |
| 239 | Mark Webber | 1047.5 |
| 2 | Alain Prost | 768.5 |
| 212 | Lando Norris | 746.0 |
| 383 | Valtteri Bottas | 716.0 |
| 341 | Rubens Barrichello | 658.0 |
| 24 | Ayrton Senna | 610.0 |
Teams¶
In [13]:
teams.head()
Out[13]:
| Pos | Team | PTS | year | |
|---|---|---|---|---|
| 0 | 1 | Vanwall | 48.0 | 1958 |
| 1 | 2 | Ferrari | 40.0 | 1958 |
| 2 | 3 | Cooper Climax | 31.0 | 1958 |
| 3 | 4 | BRM | 18.0 | 1958 |
| 4 | 5 | Maserati | 6.0 | 1958 |
In [14]:
teams.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 695 entries, 0 to 694 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Pos 695 non-null object 1 Team 695 non-null object 2 PTS 695 non-null float64 3 year 695 non-null int64 dtypes: float64(1), int64(1), object(2) memory usage: 21.8+ KB
In [15]:
teams.isnull().sum()
Out[15]:
Pos 0 Team 0 PTS 0 year 0 dtype: int64
In [16]:
team_points = teams.groupby('Team').agg(total=('PTS', sum)).reset_index()
team_points = team_points.sort_values('total', ascending=False).head(20)
team_points
Out[16]:
| Team | total | |
|---|---|---|
| 47 | Ferrari | 9877.0 |
| 111 | Mercedes | 7318.5 |
| 105 | McLaren Mercedes | 4018.0 |
| 134 | Red Bull Racing Renault | 2298.0 |
| 137 | Renault | 1777.0 |
| 131 | Red Bull Racing Honda | 1321.5 |
| 135 | Red Bull Racing TAG Heuer | 1255.0 |
| 175 | Williams Renault | 1200.0 |
| 132 | Red Bull Racing Honda RBPT | 1136.0 |
| 53 | Force India Mercedes | 1039.0 |
| 174 | Williams Mercedes | 867.0 |
| 104 | McLaren Honda | 832.0 |
| 91 | Lotus Renault | 815.0 |
| 133 | Red Bull Racing RBPT | 759.0 |
| 103 | McLaren Ford | 755.0 |
| 86 | Lotus Ford | 745.0 |
| 129 | RBR Renault | 651.5 |
| 160 | Tyrrell Ford | 560.0 |
| 168 | Williams BMW | 506.0 |
| 24 | Benetton Ford | 481.5 |
Winners¶
In [17]:
win.head()
Out[17]:
| Grand Prix | Date | Winner | Car | Laps | Time | Name Code | year | month | hours | minutes | seconds | Unnamed: 12 | Unnamed: 13 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Great Britain | 13-05-1950 | Nino Farina | Alfa Romeo | 70.0 | 13:23.6 | FAR | 1950 | 5 | 2 | 13 | 24 | NaN | NaN |
| 1 | Monaco | 21-05-1950 | Juan Manuel Fangio | Alfa Romeo | 100.0 | 13:18.7 | FAN | 1950 | 5 | 3 | 13 | 19 | NaN | NaN |
| 2 | Indianapolis 500 | 30-05-1950 | Johnnie Parsons | Kurtis Kraft Offenhauser | 138.0 | 46:56.0 | PAR | 1950 | 5 | 2 | 46 | 56 | NaN | NaN |
| 3 | Switzerland | 04-06-1950 | Nino Farina | Alfa Romeo | 42.0 | 02:53.7 | FAR | 1950 | 6 | 2 | 2 | 54 | NaN | NaN |
| 4 | Belgium | 18-06-1950 | Juan Manuel Fangio | Alfa Romeo | 35.0 | 47:26.0 | FAN | 1950 | 6 | 2 | 47 | 26 | NaN | NaN |
In [18]:
win.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1110 entries, 0 to 1109 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Grand Prix 1110 non-null object 1 Date 1110 non-null object 2 Winner 1110 non-null object 3 Car 1110 non-null object 4 Laps 1107 non-null float64 5 Time 1107 non-null object 6 Name Code 1110 non-null object 7 year 1110 non-null int64 8 month 1110 non-null int64 9 hours 1110 non-null int64 10 minutes 1110 non-null int64 11 seconds 1110 non-null int64 12 Unnamed: 12 0 non-null float64 13 Unnamed: 13 1 non-null object dtypes: float64(2), int64(5), object(7) memory usage: 121.5+ KB
In [19]:
win.drop_duplicates(inplace=True)
In [20]:
win.isnull().sum()
Out[20]:
Grand Prix 0 Date 0 Winner 0 Car 0 Laps 3 Time 3 Name Code 0 year 0 month 0 hours 0 minutes 0 seconds 0 Unnamed: 12 1110 Unnamed: 13 1109 dtype: int64
In [21]:
win['Laps'].fillna(0, inplace=True)
win['Time'].fillna('0', inplace=True)
In [22]:
from dateutil.parser import parse
win['Date'] = win['Date'].apply(parse)
In [23]:
win['Total Time in Seconds'] = win['hours'] * 3600 + win['minutes'] * 60 + win['seconds']
win.head()
Out[23]:
| Grand Prix | Date | Winner | Car | Laps | Time | Name Code | year | month | hours | minutes | seconds | Unnamed: 12 | Unnamed: 13 | Total Time in Seconds | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Great Britain | 1950-05-13 | Nino Farina | Alfa Romeo | 70.0 | 13:23.6 | FAR | 1950 | 5 | 2 | 13 | 24 | NaN | NaN | 8004 |
| 1 | Monaco | 1950-05-21 | Juan Manuel Fangio | Alfa Romeo | 100.0 | 13:18.7 | FAN | 1950 | 5 | 3 | 13 | 19 | NaN | NaN | 11599 |
| 2 | Indianapolis 500 | 1950-05-30 | Johnnie Parsons | Kurtis Kraft Offenhauser | 138.0 | 46:56.0 | PAR | 1950 | 5 | 2 | 46 | 56 | NaN | NaN | 10016 |
| 3 | Switzerland | 1950-04-06 | Nino Farina | Alfa Romeo | 42.0 | 02:53.7 | FAR | 1950 | 6 | 2 | 2 | 54 | NaN | NaN | 7374 |
| 4 | Belgium | 1950-06-18 | Juan Manuel Fangio | Alfa Romeo | 35.0 | 47:26.0 | FAN | 1950 | 6 | 2 | 47 | 26 | NaN | NaN | 10046 |
In [24]:
win['Average Time per lap'] = win['Total Time in Seconds']/win['Laps']
win.head()
Out[24]:
| Grand Prix | Date | Winner | Car | Laps | Time | Name Code | year | month | hours | minutes | seconds | Unnamed: 12 | Unnamed: 13 | Total Time in Seconds | Average Time per lap | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Great Britain | 1950-05-13 | Nino Farina | Alfa Romeo | 70.0 | 13:23.6 | FAR | 1950 | 5 | 2 | 13 | 24 | NaN | NaN | 8004 | 114.342857 |
| 1 | Monaco | 1950-05-21 | Juan Manuel Fangio | Alfa Romeo | 100.0 | 13:18.7 | FAN | 1950 | 5 | 3 | 13 | 19 | NaN | NaN | 11599 | 115.990000 |
| 2 | Indianapolis 500 | 1950-05-30 | Johnnie Parsons | Kurtis Kraft Offenhauser | 138.0 | 46:56.0 | PAR | 1950 | 5 | 2 | 46 | 56 | NaN | NaN | 10016 | 72.579710 |
| 3 | Switzerland | 1950-04-06 | Nino Farina | Alfa Romeo | 42.0 | 02:53.7 | FAR | 1950 | 6 | 2 | 2 | 54 | NaN | NaN | 7374 | 175.571429 |
| 4 | Belgium | 1950-06-18 | Juan Manuel Fangio | Alfa Romeo | 35.0 | 47:26.0 | FAN | 1950 | 6 | 2 | 47 | 26 | NaN | NaN | 10046 | 287.028571 |
Racing Nations: A World of Formula 1 Drivers¶
In [25]:
import geopandas as gpd
import plotly.express as px
# Step 2: Count the number of drivers by nationality
driver_counts = drivers['Nationality'].value_counts().reset_index()
driver_counts.columns = ['Nationality', 'Count']
# Step 3: Map nationalities to country names
nationality_to_country = {
'BRA': 'Brazil',
'SWE': 'Sweden',
'GBR': 'United Kingdom',
'ITA': 'Italy',
'FRA': 'France',
'GER': 'Germany',
'USA': 'United States',
'AUS': 'Australia',
'CAN': 'Canada',
'JPN': 'Japan',
'ESP': 'Spain',
'NED': 'Netherlands',
'ARG': 'Argentina',
'FIN': 'Finland',
'AUT': 'Austria',
'NZL': 'New Zealand',
'BEL': 'Belgium',
'SUI': 'Switzerland',
'MEX': 'Mexico',
'RSA': 'South Africa',
'DEN': 'Denmark',
'RUS': 'Russia',
'MON': 'Monaco',
'POL': 'Poland',
'THA': 'Thailand',
'VEN': 'Venezuela',
'COL': 'Colombia',
'IND': 'India',
'POR': 'Portugal',
'IRL': 'Ireland',
'RHO': 'Rhodesia',
'CHN': 'China',
'HUN': 'Hungary',
'CHI': 'Chile',
'MAS': 'Malaysia',
'INA': 'Indonesia',
'RAF': 'French Equatorial Africa'
# Add more mappings as required
}
# Apply the mapping to convert nationalities to country names
driver_counts['Country'] = driver_counts['Nationality'].map(nationality_to_country)
# Load the world map
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Merge driver counts with the world map
world = world.merge(driver_counts, how="left", left_on="name", right_on="Country")
# Plot the map
fig = px.choropleth(world,
locations='iso_a3',
color='Count',
hover_name='Country',
hover_data=['Count'],
projection='natural earth',
title='Racing Nations: A World of Formula 1 Drivers',
color_continuous_scale='Oranges') # Change the color scale here
# Customize the map style
fig.update_geos(
visible=False,
showcountries=True,
countrycolor="White",
coastlinecolor="White",
showland=True,
landcolor="LightGrey",
showocean=True,
oceancolor="LightBlue",
showlakes=True,
lakecolor="LightBlue",
showrivers=True,
rivercolor="LightBlue"
)
# Add subtitle and data source
fig.update_layout(
title={
'text': "Racing Nations: A World of Formula 1 Drivers",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
annotations=[
dict(
text="Data Source: Official site of Formula1 (https://www.formula1.com/)",
showarrow=False,
xref="paper",
yref="paper",
x=0,
y=-0.1
),
dict(
text="Number of Drivers by Nationality Worldwide",
xref="paper",
yref="paper",
x=0.5,
y=-0.25,
showarrow=False,
font=dict(
family="Arial",
size=12,
color="grey"
)
)
]
)
# Show the plot
fig.show()
Formula 1 Drivers: A Global Tapestry¶
In [26]:
# Group by 'Car' and sum the points for each car
car_performance = drivers.groupby('Car')['PTS'].sum().reset_index()
# Sort the DataFrame by total points in descending order and select top 25 distinct cars
top_25_cars = car_performance.sort_values(by='PTS', ascending=False).head(25)
# Create the scatter plot
fig = px.scatter(top_25_cars,
x='Car',
y='PTS',
hover_data=top_25_cars.columns,
title='Top 25 Distinct Cars Based on Performance',
labels={'PTS': 'Total Points', 'Car': 'Car Name'},
template='plotly_dark')
# Customize marker symbol and size
fig.update_traces(marker=dict(symbol='hexagram', size=12, color='orange', line=dict(width=1, color='black')),
selector=dict(mode='markers'))
# Adjust axis labels and title
fig.update_layout(
xaxis=dict(
title='Car Name',
showgrid=True,
gridcolor='rgba(100, 100, 100, 0.5)', # Darker shade of gray
tickangle=45,
tickfont=dict(size=10, family="Helvetica, Arial, sans-serif")
),
yaxis=dict(
title='Total Points',
showgrid=True,
gridcolor='rgba(100, 100, 100, 0.5)' # Darker shade of gray
),
title={
'text': "The Pursuit of Victory: Top 25 Cars Based on Performance",
'y':0.95,
'x':0.5,
'xanchor': 'center',
'yanchor': 'top'},
annotations=[
dict(
text="Visualization of total points scored by top 25 distinct cars",
showarrow=False,
xref="paper",
yref="paper",
x=0,
y=-10
)
]
)
# Adjust margin and spacing
fig.update_layout(
margin=dict(l=50, r=50, t=100, b=50),
height=600,
width=1100,
font=dict(family="Helvetica, Arial, sans-serif", size=12)
)
# Show the plot
fig.show()
Data Visualization¶
In [27]:
fig = px.histogram(points, x='Driver',y='total',
color='total')
fig.update_layout(
title='Drivers score ranking',
xaxis_title='Drivers',
yaxis_title='Total',
font={'color':'Black', 'size':18,'family':'Courrier New'}
)
fig.show()
In [28]:
fig2 = px.histogram(team_points, x='Team',y='total',
color='total')
fig2.update_layout(
title='Teams score ranking',
xaxis_title='Teams',
yaxis_title='Total',
font={'color':'Black', 'size':18,'family':'Courrier New'}
)
fig2.show()
In [29]:
leaderboard = drivers.groupby('Driver').sum('PTS').sort_values(by='PTS', ascending=False).reset_index()
In [30]:
fig = px.bar(leaderboard.head(20), x='Driver', y='PTS', color='PTS', title='Leaderboard most career points', height=700)
fig.show()
In [31]:
substring = 'Red Bull'
rb = (teams[teams['Team'].str.contains(substring)])
for Team in rb:
dict = {'Pos' : '', 'Team' : 'Red Bull', 'PTS' : rb['PTS'].sum()}
teams = teams[teams['Team'].str.contains(substring)==False]
teams = teams._append(dict, ignore_index=True)
williams = 'Williams'
w = teams[teams['Team'].str.contains(williams)]
for Team in w:
dict = {'Pos' : '', 'Team' : 'Williams', 'PTS' : w['PTS'].sum()}
teams = teams[teams['Team'].str.contains(williams)==False]
teams = teams._append(dict, ignore_index=True)
mclaren = 'McLaren'
mc = teams[teams['Team'].str.contains(mclaren)]
for Team in mc:
dict = {'Pos' : '', 'Team' : 'McLaren', 'PTS' : mc['PTS'].sum()}
teams = teams[teams['Team'].str.contains(mclaren)==False]
teams = teams._append(dict, ignore_index=True)
leaderboard_tm = teams.groupby('Team').sum('PTS').sort_values(by='PTS', ascending=False).reset_index().head(5)
fig = px.bar(leaderboard_tm, x='Team', y='PTS', color='PTS', title='Leaderboard most Team points', height=700)
fig.show()
In [32]:
leaderboard_win = win.groupby('Winner').size().reset_index(name='Count').sort_values(by='Count', ascending=False).head(10)
fig = px.bar(leaderboard_win, x='Winner', y='Count', color='Count', title='Leaderboard most GP wins by Driver', height=700)
fig.update_yaxes(title_text='GP wins')
fig.show()
Nationalities - Drivers¶
In [34]:
plt.figure(figsize=(10, 6))
drivers['Nationality'].value_counts().plot(kind='bar', color='skyblue', edgecolor='black')
plt.title('Distribution of drivers nationalities')
plt.xlabel('Nationalities')
plt.ylabel('Number of drivers')
plt.show()
Visualizing the Data of Winners¶
In [35]:
plt.figure(figsize=(20,10))
sns.countplot(data = win, x = 'Grand Prix', order = win['Grand Prix'].value_counts().index)
plt.xticks(rotation = 90)
plt.title('Most Grandprix hosted')
plt.ylabel('Number of races')
plt.xlabel('Grand Prix')
plt.show()
In [36]:
years = win['year'].value_counts()
plt.figure(figsize=(20,10))
sns.lineplot(years)
plt.title('Number of races each year')
plt.xlabel('Years')
plt.ylabel('Number of races')
plt.show()
In [37]:
plt.figure(figsize=(20,10))
sns.countplot(data = win, x = 'Winner', order=win['Winner'].value_counts().index)
plt.xticks(rotation = 90)
plt.title('Number of races won by racers')
plt.ylabel('Number of races won')
plt.xlabel('Winner name')
plt.show()
In [38]:
plt.figure(figsize=(20,10))
sns.histplot(data = win, x = 'Average Time per lap', bins = 30)
plt.title('Distribution of average time per lap')
plt.ylabel('Number of races')
plt.xlabel('Seconds')
plt.show()
In [39]:
top_10_grandprix_by_laps_time =win.groupby('Grand Prix')['Average Time per lap'].mean().sort_values(ascending = False).head(10)
top_10_grandprix_by_laps_time
Out[39]:
Grand Prix Pescara 597.944444 Germany 250.361329 Switzerland 155.911351 Belgium 151.823593 Morocco 146.320755 Tuscany 141.949153 South Korea 127.118182 Azerbaijan 125.480392 Singapore 116.930485 Saudi Arabia 111.760000 Name: Average Time per lap, dtype: float64
In [40]:
plt.figure(figsize=(20,10))
sns.barplot(x = top_10_grandprix_by_laps_time.index, y = top_10_grandprix_by_laps_time.values)
plt.xticks(rotation = 90)
plt.title('Top 10 Grand Prix by average lap time')
plt.ylabel('Average lap time')
plt.xlabel('Grand Prix')
plt.show()